Test outliers
# source functions source to get the
# load_filtered_micro_level function to get clr of RA
source(paste0(dir, "Code/5_29_Generate_filtered_Data_Microbiome.R"))
# clean and transform transcriptome data, subset of genes
source(paste0(dir, "Code/6_5_clean_transcriptome.R"))
# clinical data
source(paste0(dir, "Code/6_5_clean_clinical.R"))
# diagnostic plots and tables
source(paste0(dir, "Code/ref_plots.R"))
# outliers
source(paste0(dir, "Code/outliers.R"))
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "No gene list provided, will use the whole Transcriptome"
# wrappers
source(paste0(dir, "Code/wrappers.R"))
# run smccnet
source(paste0(dir, "Code/put_together.R"))
########## Datasets ############# phenotype contains ID ####
clin <- rescaled_cli()
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "No gene list provided, will use the whole Transcriptome"
# where na in LPS
n_na <- which(is.na(clin$LPS))
sum(is.na(clin$CD14))
## [1] 0
# LPS and CD14 as dataframe
LPS <- clin %>% select(LPS)
CD14 <- clin %>% select(CD14)
##### Transcriptome ######
rna_isgs <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/coreISG"))) %>%
rescaled_rna(., rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
# the isgs data
isgs_rlog <- rna_isgs[[1]]
# names
colnames(isgs_rlog) <- rna_isgs[[2]]$Symbol
rna_genesbeta <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/genesbeta"))) %>%
rescaled_rna(., rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
# the isgs data
genesbeta_rlog <- rna_genesbeta[[1]]
# names
colnames(genesbeta_rlog) <- rna_genesbeta[[2]]$Symbol
######## microbiome ###### Microbiome
micro_data <- load_filtered_micro_level_samples("genus", prevalence = 40,
RA = 2, wd = "Ubuntu")
## Found "Unclassified" category in input data
## Created new "Other" category.
## Converted 35400 counts to "Other" otu category.
## Remaining OTUS: 270 (Including "Other").
##
## Prevalence cutoff: 40% (i.e., at least 40% of libaries must be represented to keep OTU)
## Found 'Other' category in input data.
## Collapsed 195 OTUs to 'Other' in OTU table.
## Converted 80457 counts to 'Other' in OTU table.
## Remaining OTUs: 75 (Including 'Other').
##
## Relative abundance cutoff: 2 % (i.e., at least one library must have RA > 2 % to keep OTU).
## Found "Other" category in input data.
## Collapsed 21 OTUs to "Other" otu category.
## Converted 47123 counts to "Other" otu category.
## Remaining OTUS: 54 (Including "Other").
##
## Contains 27 subjects/libraries from Explicet OTU file.
names(micro_data)
## [1] "filtered_RA" "filtered_clr" "full_RA" "library_size"
micro_clr <- micro_data[[2]] %>% as.data.frame()
# rescale to mean 0 and variance 1
mibi <- rescale_microbiome(micro_clr)
######### outliers in clinical parameter ##############
# for lps
dot_groupby(clin, clin$LPS, clin$Group, 0.8, "HIV Status", "LPS")

dot_groupby(clin, clin$LPS, 1, 0.8, "The whole cohort", "LPS")

grubbs_wrapper(LPS[,1], 10)
## [1] "Outlier"
grubbs.test(LPS[,1], # a numeric vector for data values.
type = 10, # Integer value indicating test variant.
# 10 is a test for one outlier (side is detected automatically and can be reversed by opposite parameter).
# 11 is a test for two outliers on opposite tails, 20 is test for two outliers in one tail.
opposite = FALSE, # a logical indicating whether you want to check not the value with largest difference from the mean
two.sided = FALSE)
##
## Grubbs test for one outlier
##
## data: LPS[, 1]
## G = 2.69400, U = 0.69807, p-value = 0.04735
## alternative hypothesis: highest value 2.69404391214899 is an outlier
dot_groupby(clin, clin$CD14, clin$Group, 0.8, "HIV Status", "CD14")

dot_groupby(clin, clin$CD14, 1, 0.8, "The whole cohort", "CD14")

# one outlier
grubbs.test(CD14[,1],
type = 10)
##
## Grubbs test for one outlier
##
## data: CD14[, 1]
## G = 3.20800, U = 0.58895, p-value = 0.004239
## alternative hypothesis: highest value 3.2080477151983 is an outlier
# two opposite
grubbs.test(CD14[,1],
type = 11)
##
## Grubbs test for two opposite outliers
##
## data: CD14[, 1]
## G = 4.81220, U = 0.50123, p-value = 0.05031
## alternative hypothesis: -1.60419928217619 and 3.2080477151983 are outliers
# two ouliers
grubbs.test(CD14[,1],
type = 20)
##
## Grubbs test for two outliers
##
## data: CD14[, 1]
## U = 0.33938, p-value < 2.2e-16
## alternative hypothesis: highest values 2.37443012510193 , 3.2080477151983 are outliers
### test per group hiv ##
grubbs_wrapper(LPS[,1], 10, "hiv")
## [1] "Nonoutlier"
grubbs_wrapper(LPS[,1], 20, "hiv")
## 1
## "Nonoutlier"
grubbs_wrapper(CD14[,1], 10, "hiv")
## [1] "Outlier"
grubbs_wrapper(CD14[,1], 10, "control")
## [1] "Nonoutlier"
grubbs_wrapper(CD14[,1], 20, "hiv")
## 1
## "Outlier"
grubbs_wrapper(CD14[,1], 11, "hiv")
## [1] "Nonoutlier"
####### outlier in mibi ############
sapply(colnames(mibi)[1:10], FUN = function(x){
dot_groupby(mibi, mibi[,x], clin$Group, 0.8, "HIV Status",
paste("Rescaled-clr of", x) )
})










## Actin.Actin.Bifidobacterium Actin.Actin.Rhodococcus
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Actin.Corio.Collinsella Bacte.Bacte.Alistipes
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Bacteroides Bacte.Bacte.Barnesiella
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Parabacteroides Bacte.Bacte.Prevotella
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Prevotellaceae Bacte.Bacte.S24.7
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
sapply(colnames(mibi)[1:10], FUN = function(x){
dot_groupby(mibi, mibi[,x], 1, 0.8, "The whole cohort",
paste("Rescaled-clr of", x) )
})










## Actin.Actin.Bifidobacterium Actin.Actin.Rhodococcus
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Actin.Corio.Collinsella Bacte.Bacte.Alistipes
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Bacteroides Bacte.Bacte.Barnesiella
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Parabacteroides Bacte.Bacte.Prevotella
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
## Bacte.Bacte.Prevotellaceae Bacte.Bacte.S24.7
## data List,54 List,54
## layers List,2 List,2
## scales ? ?
## mapping List,2 List,2
## theme List,59 List,59
## coordinates ? ?
## facet ? ?
## plot_env ? ?
## labels List,2 List,2
###### outlier in isgs_rlog #########
sapply(colnames(isgs_rlog )[1:10], FUN = function(x){
dot_groupby(isgs_rlog , isgs_rlog[,x], clin$Group, 0.8, "HIV Status",
paste("Rescaled-rlog of", x) )
})










## LAP3 CD38 ETV7 NUB1 SLC38A5 TYMP TMSB10
## data List,246 List,246 List,246 List,246 List,246 List,246 List,246
## layers List,2 List,2 List,2 List,2 List,2 List,2 List,2
## scales ? ? ? ? ? ? ?
## mapping List,2 List,2 List,2 List,2 List,2 List,2 List,2
## theme List,59 List,59 List,59 List,59 List,59 List,59 List,59
## coordinates ? ? ? ? ? ? ?
## facet ? ? ? ? ? ? ?
## plot_env ? ? ? ? ? ? ?
## labels List,2 List,2 List,2 List,2 List,2 List,2 List,2
## EIF2AK2 PARP12 TNK2
## data List,246 List,246 List,246
## layers List,2 List,2 List,2
## scales ? ? ?
## mapping List,2 List,2 List,2
## theme List,59 List,59 List,59
## coordinates ? ? ?
## facet ? ? ?
## plot_env ? ? ?
## labels List,2 List,2 List,2
sapply(colnames(isgs_rlog )[1:10], FUN = function(x){
dot_groupby(isgs_rlog , isgs_rlog[,x], 1, 0.8, "The whole cohort",
paste("Rescaled-rlog of", x) )
})










## LAP3 CD38 ETV7 NUB1 SLC38A5 TYMP TMSB10
## data List,246 List,246 List,246 List,246 List,246 List,246 List,246
## layers List,2 List,2 List,2 List,2 List,2 List,2 List,2
## scales ? ? ? ? ? ? ?
## mapping List,2 List,2 List,2 List,2 List,2 List,2 List,2
## theme List,59 List,59 List,59 List,59 List,59 List,59 List,59
## coordinates ? ? ? ? ? ? ?
## facet ? ? ? ? ? ? ?
## plot_env ? ? ? ? ? ? ?
## labels List,2 List,2 List,2 List,2 List,2 List,2 List,2
## EIF2AK2 PARP12 TNK2
## data List,246 List,246 List,246
## layers List,2 List,2 List,2
## scales ? ? ?
## mapping List,2 List,2 List,2
## theme List,59 List,59 List,59
## coordinates ? ? ?
## facet ? ? ?
## plot_env ? ? ?
## labels List,2 List,2 List,2
############### subseting #############
# microbiome whole cohort
whole_outmibi <- apply(mibi, 2, FUN = function(x){
ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
#
hiv_outmibi <- apply(mibi, 2, FUN = function(x){
ifelse(grubbs_wrapper(x, 10, "hiv") == "Nonoutlier", TRUE, FALSE)
})
sum(whole_outmibi)
## [1] 38
sum(hiv_outmibi)
## [1] 40
# BOTH outlier
sum((whole_outmibi == FALSE) & (hiv_outmibi == FALSE) )
## [1] 12
# outlier as whole
sum((whole_outmibi == FALSE) & (hiv_outmibi == TRUE) )
## [1] 4
# BOTH outlier
which((whole_outmibi == TRUE) & (hiv_outmibi == FALSE) )
## Prote.Betaproteobacteria Prote.Epsil.Helicobacter
## 44 47
# test the one different between whole and per group
sapply(colnames(mibi)[44], FUN = function(x){
dot_groupby(mibi, mibi[,x], clin$Group, 0.8, "HIV Status",
paste("Rescaled-clr of", x) )
})

## Prote.Betaproteobacteria
## data List,54
## layers List,2
## scales ?
## mapping List,2
## theme List,59
## coordinates ?
## facet ?
## plot_env ?
## labels List,2
sapply(colnames(mibi)[44], FUN = function(x){
dot_groupby(mibi, mibi[,x], 1, 0.8, "The whole cohort",
paste("Rescaled-clr of", x) )
})

## Prote.Betaproteobacteria
## data List,54
## layers List,2
## scales ?
## mapping List,2
## theme List,59
## coordinates ?
## facet ?
## plot_env ?
## labels List,2
################ subsetting only by whole ##############
isgs_outlier <- apply(isgs_rlog, 2, FUN = function(x){
ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
dim(isgs_rlog)
## [1] 27 246
dim(isgs_rlog[, isgs_outlier])
## [1] 27 151
mibi_outlier <- apply(mibi, 2, FUN = function(x){
ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
dim(mibi)
## [1] 27 54
dim(mibi[, mibi_outlier])
## [1] 27 38
####### unweighted ###### with outliers
run_SmCCNet(isgs_rlog, mibi, LPS, 0.35, 0.1, 0.8, 0.9, NULL,
n_na)
## [1] "weights can be NULL or a length 3 vector"


## NULL

## NULL

## NULL
## [[1]]
## [1] 1 3 6 12 13 19 21 23 27 28 53 62 67 84 86 102 103
## [18] 122 126 127 129 145 150 151 158 168 187 189 191 192 198 203 205 206
## [35] 226 227 233 236 238 254
##
## [[2]]
## [1] 2 4 8 14 16 35 42 43 48 54 60 70 71 72 79 80 90
## [18] 91 93 96 99 107 108 111 115 117 118 119 121 123 132 152 154 160
## [35] 165 167 171 173 180 188 195 197 199 221 225 232 245 255
##
## [[3]]
## [1] 22 34 76 81 87 97 100 113 114 116 169 176 193 201 211 212 222
## [18] 282
# deleting all outliers
run_SmCCNet(isgs_rlog[, isgs_outlier], mibi[, mibi_outlier],
LPS, 0.05, 0.05, 0.8, 0.9, NULL, n_na)
## [1] "weights can be NULL or a length 3 vector"


## NULL

## NULL

## NULL

## NULL

## NULL
## [[1]]
## [1] 14 176
##
## [[2]]
## [1] 27 174
##
## [[3]]
## [1] 62 156
##
## [[4]]
## [1] 77 155
##
## [[5]]
## [1] 105 181
run_SmCCNet(isgs_rlog[, isgs_outlier], mibi[, mibi_outlier],
LPS, 0.6, 0.05, 0.8, 0.9, NULL, n_na)
## [1] "weights can be NULL or a length 3 vector"


## NULL

## NULL

## NULL
## [[1]]
## [1] 1 2 6 7 8 11 19 25 30 31 32 35 36 37 38 39 49
## [18] 51 56 57 58 60 62 67 68 69 72 73 74 76 78 79 84 85
## [35] 87 89 92 94 97 99 103 105 106 108 112 120 127 128 130 131 132
## [52] 133 136 139 140 149 151 156
##
## [[2]]
## [1] 3 4 10 12 13 14 17 18 20 22 29 33 34 40 43 44 52
## [18] 54 55 59 63 64 71 75 77 80 82 83 90 93 95 96 98 100
## [35] 104 107 114 117 118 119 121 122 123 124 134 137 138 141 143 144 146
## [52] 147 150 155
##
## [[3]]
## [1] 9 15 23 24 26 27 28 42 47 61 65 66 81 86 102 111 113
## [18] 116 126 135 145 148 181
apply(mibi, 2, FUN = function(x) {
})
## NULL